Dataset downloaded from mgandal’s github repository.
# Load csvs
datExpr = read.csv('./../Data/Gandal/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/Gandal/RNAseq_ASD_datMeta.csv')
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
# Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers,
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
datMeta$Batch = gsub('/', '.', datMeta$RNAExtractionBatch) %>% as.factor
# Transform Diagnosis into a factor variable
datMeta$Diagnosis_ = factor(datMeta$Diagnosis_, levels=c('CTL','ASD'))
Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).
print(paste0('Dataset includes ', nrow(datExpr), ' probes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 63682 probes from 88 samples belonging to 41 different subjects."
Diagnosis distribution: There are more ASD samples than controls
table(datMeta$Diagnosis_)
##
## CTL ASD
## 35 53
Brain region distribution: All regions seem to be balanced
table(datMeta$Brain_lobe)
##
## Frontal Temporal Parietal Occipital
## 22 20 23 23
Diagnosis and brain region seem to be balanced except for the frontal lobe, where there are more control samples than ASD ones.
table(datMeta$Diagnosis_, datMeta$Brain_lobe)
##
## Frontal Temporal Parietal Occipital
## CTL 13 6 8 8
## ASD 9 14 15 15
Sex distribution: There are many more Male samples than Female ones
table(datMeta$Sex)
##
## F M
## 15 73
Age distribution: Subjects between 5 and 60 years old with a mean close to 30
summary(datMeta$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 17.00 28.00 29.74 41.75 60.00
1. Filter probes with start or end position missing
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 5, 63677 remaining"
2. Filter probes with low expression levels
\(\qquad\) 2.1 Remove probes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 13720, 49957 remaining"
\(\qquad\) 2.2 Removing probes with a mean expression lower than 0.5
plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr))
ggplotly(plot_data %>% ggplot(aes(x=mean_expression)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
geom_vline(xintercept=0.5, color='gray') + scale_x_log10() +
ggtitle('Probe Mean Expression distribution') + theme_minimal())
to_keep = rowMeans(datExpr)>0.5
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 16764, 33193 remaining"
3. Filter outlier samples
\(\qquad\) 3.1 Gandal filters samples belonging to subject AN03345 without giving an explanation. Since it could have some technical problems, I remove them as well
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 2, 86 remaining"
\(\qquad\) 3.2 Filter out outliers: Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples
Outliers don’t seem to have any characterstic in common (different subjects, extraction batches, brain lobes, age, PMI), except for diagnosis and sex, although sex could be just because the sex bias in the dataset
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID,
'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$RNAExtractionBatch,
'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
'Diagnosis'=datMeta$Diagnosis_, 'PMI'=datMeta$PMI)
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
print(paste0('Outlier samples: ', paste(as.character(plot_data$Sample_ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: AN01971_BA38, AN17254_BA17, AN09714_BA38, AN01093_BA7, AN02987_BA17, AN11796_BA7"
to_keep = abs(z.ku)<2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 6, 80 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' probes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 33193 probes and 80 samples"
According to Tackling the widespread and critical impact of batch effects in high-throughput data, technical artifacts can be an important source of variability in the data, so batch correction should be part of the standard preprocessing pipeline of gene expression data.
They say Processing group and Date of the experiment are good batch surrogates, so I’m going to see if they affect the data in any clear way to use them as surrogates.
All the information we have is the Brain Bank, and although all the samples were obtained from the Autism Tissue Project, we don’t have any more specific information about who preprocessed each sample
table(datMeta$Brain_Bank)
##
## ATP
## 80
There are two different dates when the data was procesed
table(datMeta$RNAExtractionBatch)
##
## 10/10/2014 6/20/2014
## 53 27
Luckily, there doesn’t seem to be a correlation between the batch surrogate and the objective variable, so the batch effect will not get confused with the Diagnosis effect
table(datMeta$RNAExtractionBatch, datMeta$Diagnosis_)
##
## CTL ASD
## 10/10/2014 24 29
## 6/20/2014 11 16
*All the samples from each subject were processed on the same day (makes sense, otherwise they wound need to freeze the samples)
Samples don’t seem to cluster together that strongly for each batch, although there does seem to be some kind of relation
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram
create_viridis_dict = function(){
min_age = datMeta$Age %>% min
max_age = datMeta$Age %>% max
viridis_age_cols = viridis(max_age - min_age + 1)
names(viridis_age_cols) = seq(min_age, max_age)
return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
dend_meta = datMeta[match(substring(labels(h_clusts),2), datMeta$Dissected_Sample_ID),] %>%
mutate('Batch' = ifelse(RNAExtractionBatch=='10/10/2014', '#F8766D', '#00BFC4'),
'Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 4 colours
Brain_lobe=='Temporal'~'#7CAE00',
Brain_lobe=='Parietal'~'#00BFC4',
Brain_lobe=='Occipital'~'#C77CFF'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis, Batch)
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)
There seems to be a different behaviour by batch mainly in the first and third principal components
pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
## PC1 PC2 PC3
## Standard deviation 273215.31037 171051.12817 98195.28432
## Proportion of Variance 0.57415 0.22504 0.07416
## Cumulative Proportion 0.57415 0.79920 0.87336
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')
plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
rm(pca, plot_data)
Comparing the mean expression of each sample by batch we can see the batch effect is reflected here as well
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data)
In conclusion: Date of extraction works as a surrogate for batch effect
Using DESeq2 package to perform normalisation. Chose this package over limma because limma uses the log transformed data as input instead of the raw counts and I have discovered that in this dataset, this transformation affects probes differently depending on their mean expression level, and genes with a high SFARI score are specially affected by this.
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
Using vst instead of rlog to perform normalisation. Bioconductor question explaining differences between methods where Michael Love (author of DESEq2) recommends using vst.
Performing batch correction by adding the batch variable in the design formula of the DESeqDataSet object
Including a log fold change of log2(1.2) in the results formula to correct the null hypothesis \(H_0:|lfc|\leq\theta\) instead of \(H0:lfc=0\) because that’s the hypothesis we are interested in
counts = datExpr %>% as.matrix
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design = ~ Batch + Diagnosis_) # Including batch effect
# Perform DEA
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 177 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
DE_info = results(dds, lfcThreshold=log2(1.2), altHypothesis='greaterAbs')
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datProbes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, dds, vsd)
Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic
meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()
When plotting point by point it seems like the probes with the lowest values behave differently
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.05) +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
Based on the last plot, I’m increasing the filtering threshold from mean expression > 0.5 to mean expression > 1 to remove the weird behaviour the lowest expressed probes create in the normalised data
Filtering probes with mean expression lower than 1
to_keep = rowMeans(datExpr)>1
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 3081, 30112 remaining"
rm(to_keep)
save(datExpr, datMeta, datProbes, file='./../Data/Gandal/filtered_raw_data.RData')
And repeating Normalisation
counts = datExpr %>% as.matrix
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design = ~ Batch + Diagnosis_) # Including batch effect
# Perform DEA
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 145 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
DE_info = results(dds, lfcThreshold=log2(1.2), altHypothesis='greaterAbs')
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datProbes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
This plot remains stable
meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()
This one looks better now. The valley found in the original density plot of the mean expression of the probes seems to still be present here beecause there seem to be two clouds main of points
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.05) +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
datExpr = datExpr_vst
datMeta = datMeta_vst %>% data.frame
datProbes = datProbes_vst
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' probes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 30112 probes and 80 samples"
rm(datExpr_vst, datMeta_vst, datProbes_vst)
By including the Batch effect in the DESeq formula we only modelled it into the DEA, but we didn’t actually correct it from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data
After normalisation there seems to still be a difference in the behaviour in the first PC, but the other PC don’t seem to have a relation any more
pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
## PC1 PC2 PC3
## Standard deviation 36.82471 30.20426 21.38357
## Proportion of Variance 0.20354 0.13694 0.06863
## Cumulative Proportion 0.20354 0.34048 0.40911
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')
plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()
rm(pca, plot_data)
It seems like the Batch effect might have flipped during the normalisation
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data)
datExpr = datExpr %>% as.matrix %>% ComBat(batch=datMeta$Batch)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
Plots changed although I’m not sure if it’s for better or for worse?
pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
## PC1 PC2 PC3
## Standard deviation 35.78914 29.62096 21.34238
## Proportion of Variance 0.19990 0.13694 0.07109
## Cumulative Proportion 0.19990 0.33684 0.40793
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')
plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
rm(pca, plot_data)
Both batches now have almost the same mean expression
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data)
save(datExpr, datMeta, datProbes, DE_info, dds, file='./../Data/Gandal/preprocessed_data.RData')
# Load SFARI information
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_01-15-2019.csv')
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol') %>%
distinct(ID, .keep_all = TRUE)
write.csv(SFARI_genes, './../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv', row.names=F)